#!/usr/bin/env python
# coding: utf-8

# In "Leveraging Researcher Domain Expertise_Replication Code_A", we laid out the procedure for simulating our method on the New York Times Front Page dataset, as this best emulates the use case for political science research.
# 
# Here, we lay out two additional simulation schemes also noted within our manuscript that were run on the 20 NewsGroups dataset.
# 
# Specifically, we noted the effectiveness of our method in two class distribution schemes - on the original, raw dataset and on a dataset where we skew the balance between categories - in each run choosing a single category to undersample, and three other categories to remain much larger (at their original size). 
# 
# Here, we lay out the steps for the second distribution scheme (as the first one simply runs on the existing class scheme and distribution).
# 
# May 2022

# # Load the text embeddings:

# We use the same SentenceTransformers model as in the previous script. However, the 20 NewsGroups dataset consists of short text segments, so when we converted each segment to an embedding, we kept it as is, without converting it to the document level.

# In[ ]:


import pandas as pd
import os
import numpy as np
import pickle


# In[ ]:


### DIRECTORY WITH THE DATASET:
## change to the directory location on user computer

DIR = r"E:\Simulation_Replication Materials"


# In[ ]:


## Load the embeddings for the corpus
embds = os.path.join(DIR, "20newsgroups_forSimulation_Embeddings.pickle")

# Use a context manager so the file handle is closed as soon as the
# embeddings are read (the handle was previously left open).
# NOTE: pickle.load executes arbitrary code from the file; only load the
# replication file you trust.
with open(embds, "rb") as pickle_in:
    embeddings = pickle.load(pickle_in)


# In[ ]:


## Load the CSV with the metadata for each embedding - text, category, etc.

df = pd.read_csv(os.path.join(DIR, '20newsgroups_forSimulation_Metadata.csv'))


# In[ ]:


## Number of embeddings (shown as cell output in a notebook; a no-op when
## run as a plain script)
(len(embeddings))


# # Running the Simulations

# In[ ]:


### Add the document embeddings to the dataframe
## (one embedding per metadata row; pandas raises if the lengths differ)

df['X'] = [np.array(x) for x in embeddings]


# In[ ]:


### list of categories:

CATEGORIES = list(set(df['Category']))


# In[ ]:


import random

## Seed BOTH random number generators used below:
##  - numpy's global RNG (seeded here as before)
##  - the stdlib `random` module, which choose_random_categories
##    (random.sample) and run_annotations (random.choice) call directly.
## Seeding only numpy left the category choices different on every execution.
np.random.seed(17)
random.seed(17)


# In[ ]:


## function to randomly choose categories for classification:

def choose_random_categories(list_of_categories, number_of_cats):
    """Pick `number_of_cats` entries from `list_of_categories` via random.sample.

    Sampling is without replacement and uses the stdlib `random` module's
    global state. Returns a new list; the input list is not modified.
    """
    chosen = random.sample(list_of_categories, number_of_cats)
    return chosen


# In[ ]:


### Because there is already an imbalance between categories,
### we prepare two lists of categories to choose from:

## smaller categories (run_simulation draws the undersampled class from here)
potential_rare = [
    'sport',
    'automobile',
    'religion',
    'medicine',
    'sales',
    'alt.atheism',
]

## larger categories (run_simulation draws the frequent classes from here)
potential_freq = [
    'computer',
    'science',
    'politics',
]


# In[ ]:


### undersample small categories, maintain consistent large samples for bigger categories:
def filter_categories_for_pool(rare_cats, freq_cats, df, n_rare=400, n_freq=2600):
    """Build the skewed corpus by sampling a fixed number of rows per category.

    Parameters
    ----------
    rare_cats : iterable
        Values of df['Category'] to undersample (`n_rare` rows each).
    freq_cats : iterable
        Values of df['Category'] to keep large (`n_freq` rows each).
    df : pandas.DataFrame
        Source frame; must contain a 'Category' column.
    n_rare, n_freq : int
        Rows drawn per rare / frequent category. The defaults (400 / 2600)
        are the sizes that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, rare categories first, then frequent ones, keeping
        the original index labels. An empty DataFrame if both category lists
        are empty.

    Raises
    ------
    ValueError
        Raised by DataFrame.sample when a category has fewer rows than the
        requested sample size (sampling is without replacement).
    """
    sampled_parts = []

    # Rare categories are sampled before frequent ones so the sequence of
    # draws from numpy's global RNG matches the original two-loop version.
    for cats, n_rows in ((rare_cats, n_rare), (freq_cats, n_freq)):
        for cat_ in cats:
            sampled_parts.append(df[df['Category'] == cat_].sample(n=n_rows))

    # pd.concat raises on an empty list; keep the old "empty frame" result.
    if not sampled_parts:
        return pd.DataFrame()

    # Concatenate once instead of re-copying a growing frame every iteration.
    return pd.concat(sampled_parts)


# In[ ]:


## Simulation parameters:

RUNS = 300        # number of times the whole simulation is repeated

STEPSIZE = 18     # number of samples to extract at each sampling iteration

NUM_STEPS = 100   # iterations per simulation run (more iterations -> more sampling rounds)


# In[ ]:


## Imports for simulation functions:

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from scipy import stats
from scipy.stats import entropy
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

## TQDM Notebook - not required, but allows the user to track simulation progress:
from tqdm import tqdm_notebook


# In[ ]:


def train_svm(X, y, proba=False):
    """Fit an SVC on (X, y) and return the fitted classifier.

    `proba` is forwarded to SVC(probability=...); run_annotations sets it to
    True only for the active-learning model, which needs predict_proba.
    All other SVC hyperparameters are left at their defaults.
    """
    # TODO (from the original notes): add hyperparameter tuning, and confirm
    # whether probability=True is needed.
    return SVC(probability=proba).fit(X, y)

# note - for avg_calculation can do either "macro", "micro" or "weighted"
# note - removed accuracy from the function

def eval_svm(clf, testX, testY, avg_calculation):
    """Return the F1 score of `clf` on the test set.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``.
    testX : test features, passed straight to ``clf.predict``.
    testY : true labels for `testX`.
    avg_calculation : averaging mode forwarded to ``f1_score(average=...)``,
        e.g. "macro", "micro" or "weighted".
    """
    # f1_score is not among the names imported at module level, so the
    # function used to fail with NameError; import it where it is used.
    from sklearn.metrics import f1_score

    predY = clf.predict(testX)
    return f1_score(testY, predY, average=avg_calculation)


def calculate_centroids(df, categories):
    """Return the mean embedding (centroid) of each category.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'y' column (label) and an 'X' column holding one vector
        per row.
    categories : iterable
        Label values to compute centroids for.

    Returns
    -------
    list
        One element per entry of `categories`, in the same order: the
        element-wise mean of the 'X' vectors whose 'y' equals that category.
        A category with no rows yields NaN (numpy's mean of an empty list).
    """
    # Pull the vectors straight from the column instead of looping over
    # rows with iterrows(); the values handed to np.mean are the same.
    return [
        np.mean(df.loc[df['y'] == c, 'X'].to_list(), axis=0)
        for c in categories
    ]

def plot_f1s(performance_dict, samples=None):
    """Plot one F1 curve per key of `performance_dict`.

    Parameters
    ----------
    performance_dict : dict
        Maps a label (used in the legend) to a sequence of F1 values.
    samples : sequence, optional
        X-axis values (number of training samples at each point). The
        function previously read an undefined global named `samples` and
        failed with NameError; it is now an explicit argument. When omitted,
        each curve is plotted against its positional index.
    """
    # Imported locally: the module only imports pyplot in the final
    # plotting cell, after this function is defined.
    import matplotlib.pyplot as plt

    ### graph the f1 scores by category:
    plt.figure(figsize=(12,7))
    plt.xlabel(str('Samples'), fontsize=14, fontweight='bold')
    plt.ylabel(str('f1'), fontsize=14, fontweight='bold')
    for label, scores in performance_dict.items():
        x_values = range(len(scores)) if samples is None else samples
        plt.plot(x_values, scores, label=label)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Class (# samples in Test Set)')
    plt.tight_layout()
    plt.show()

def create_classification_report(clf, testX, testY, output_dict=True):
    """Predict on `testX` and return sklearn's classification report.

    Both the true labels and the predictions are cast to int and mapped back
    through the module-level LabelEncoder `le` (inverse_transform), so the
    report is keyed by the decoded labels rather than by the integer codes.
    `output_dict` is forwarded to classification_report.
    """
    predicted = clf.predict(testX)
    true_names = le.inverse_transform([int(label) for label in testY])
    predicted_names = le.inverse_transform([int(label) for label in predicted])
    report = classification_report(true_names, predicted_names, output_dict=output_dict)
    return report

def get_weights(last_perfs, inflation_factor = 1.5, classes=None):
    """Turn the latest per-category F1 scores into sampling probabilities.

    Each category is scored as ``(1 - f1) * inflation_factor`` and the scores
    are passed through a softmax, so categories with a LOWER F1 receive a
    HIGHER probability. `inflation_factor` controls how large the gap between
    well- and badly-performing categories becomes.

    Parameters
    ----------
    last_perfs : dict
        A classification report in dict form; only entries whose key matches
        a class name are used, and each such entry must hold an "f1-score".
    inflation_factor : float
        Multiplier applied before the softmax.
    classes : iterable, optional
        Class names to look for in `last_perfs` (compared as strings).
        Defaults to the classes of the module-level LabelEncoder `le`.

    Returns
    -------
    numpy.ndarray
        Probabilities in the key order of `last_perfs`; an empty array when
        no key of `last_perfs` matches a class.
    """
    # softmax is not imported at module level (calling this function used
    # to raise NameError), so import it where it is needed.
    from scipy.special import softmax

    if classes is None:
        classes = le.classes_.tolist()
    # Build the lookup once instead of rebuilding the list for every key.
    class_keys = {str(x) for x in classes}

    f1s = [
        (1 - report["f1-score"]) * inflation_factor
        for key, report in last_perfs.items()
        if key in class_keys
    ]
    if not f1s:
        # softmax cannot reduce over an empty array
        return np.array([])
    return softmax(f1s)


# In[ ]:


### create the function for the actual running of the methods:

# Fixed cosine-similarity bands (lower, upper] sampled by the two variants of
# our method.
_STRATA_ORIGINAL = [(0.8, 0.9), (0.7, 0.8), (0.6, 0.7), (0.5, 0.6), (0.4, 0.5),
                    (0.3, 0.4)]
## closer strata for larger corpus
_STRATA_CLOSER = [(0.95, 1), (0.9, 0.95), (0.85, 0.9), (0.8, 0.85), (0.7, 0.8),
                  (0.6, 0.7)]


def _sample_similarity_strata(pool, annotated, categories, strata, stepsize):
    """Draw one batch for "our method" and return (new_samples, remaining_pool).

    Steps:
      1. compute the centroid of every category from `annotated`;
      2. store each pool text's cosine similarity to every centroid in a
         column named after the category (written onto `pool`);
      3. choose one category at random (stdlib `random`);
      4. for each (low, high] band in `strata`, sample
         int(stepsize / len(strata)) pool texts whose similarity to the chosen
         category falls in the band, or take the whole band when it holds
         fewer texts than that, and remove the picked texts from the pool.

    NOTE(review): the original code also computed per-band quantiles
    ("empirical strata") but never used them; selection has always relied on
    the fixed thresholds in `strata`, which is what is kept here.
    """
    centroids = calculate_centroids(annotated, categories)
    similarities = cosine_similarity(np.array(pool["X"].to_list()), centroids)
    for col, cat in enumerate(categories):
        pool[cat] = similarities[:, col]

    ### randomly choose category
    cat_choice = str(random.choice(categories))

    quota = stepsize / len(strata)
    picked = []
    for low, high in strata:
        # select only the subset corresponding to this stratum
        band = pool[(pool[cat_choice] > low) & (pool[cat_choice] <= high)]
        if len(band) < quota:
            newsamp = band
        else:
            newsamp = band.sample(int(quota))
        picked.append(newsamp)
        # remove it from pool
        pool = pool.drop(newsamp.index)

    return pd.concat(picked), pool


def run_annotations(pooldf, filtered_categories, test, STEPSIZE=STEPSIZE, NUM_STEPS=NUM_STEPS):
    """Simulate four annotation strategies side by side on the same pool.

    Strategies: random sampling, active learning (highest predictive entropy
    of an SVC), our method with the original similarity strata, and our
    method with the "closer" strata.

    Parameters
    ----------
    pooldf : pandas.DataFrame
        Unannotated pool with columns 'X' (embedding) and 'y' (label).
    filtered_categories : list
        Labels present in the pool; also used as names of the similarity
        columns added to the pools of our method.
    test : pandas.DataFrame
        Held-out set with the same 'X' / 'y' columns, used for evaluation.
    STEPSIZE : int
        Texts requested per sampling round.
    NUM_STEPS : int
        Number of rounds; round 0 only evaluates the initial annotated sets.

    Returns
    -------
    list
        [f1s_random, f1s_active, f1s_ours_original, f1s_ours_closer, samples].
        Each f1s_* list holds one classification report per round; `samples`
        holds the size of the RANDOM strategy's annotated set per round (our
        method can add fewer texts in a round when a stratum is nearly empty).
        All five lists are empty when NUM_STEPS is 0.
    """
    # results per iteration
    samples = []  # number of samples
    f1s_random = []
    f1s_active = []
    f1s_ours_original = []
    f1s_ours_closer = []

    # pool of unannotated texts for each strategy
    pooldf_ours_original = pooldf.copy()
    pooldf_ours_closer = pooldf.copy()
    pooldf_random = pooldf.copy()
    pooldf_active = pooldf.copy()

    # core set of annotated texts for each strategy
    # our method starts with 20 texts from every class
    annotated_ours_original = pooldf_ours_original.groupby("y").sample(20)
    annotated_ours_closer = annotated_ours_original.copy()
    # the baselines start with the same NUMBER of texts, drawn at random
    annotated_random = pooldf_random.sample(annotated_ours_original.shape[0])
    if len(set(annotated_random['y'])) == 1:
        # Only one class was drawn (presumably unusable for fitting a
        # classifier): swap the last text for one text of every class.
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        annotated_random = annotated_random.drop(annotated_random.tail(1).index)
        annotated_random = pd.concat(
            [annotated_random, pooldf_ours_original.groupby("y").sample(1)]
        )
    annotated_active = annotated_random.copy()

    # remove annotated texts from pool
    pooldf_random = pooldf_random.drop(annotated_random.index)
    pooldf_active = pooldf_active.drop(annotated_active.index)
    pooldf_ours_closer = pooldf_ours_closer.drop(annotated_ours_closer.index)
    pooldf_ours_original = pooldf_ours_original.drop(annotated_ours_original.index)

    # the test set never changes, so convert it once
    test_X = test["X"].to_list()
    test_y = test["y"]

    # main loop - number of iterations
    for i in range(NUM_STEPS):
        # Iteration 0 evaluates the preliminary sample collection as is;
        # every later iteration first extends each annotated set.
        # NOTE: the order of the sampling / training calls below is kept
        # exactly as before because they draw from shared global RNGs.
        if i > 0:
            ############################ OUR METHOD - ORIGINAL #################################
            new_ours_original, pooldf_ours_original = _sample_similarity_strata(
                pooldf_ours_original, annotated_ours_original,
                filtered_categories, _STRATA_ORIGINAL, STEPSIZE)

            ############################ OUR METHOD - MODIFIED STRATA ##########################
            new_ours_closer, pooldf_ours_closer = _sample_similarity_strata(
                pooldf_ours_closer, annotated_ours_closer,
                filtered_categories, _STRATA_CLOSER, STEPSIZE)

            ############################ RANDOM SAMPLING #######################################
            new_random = pooldf_random.sample(STEPSIZE)
            pooldf_random = pooldf_random.drop(new_random.index)

            ############################ ACTIVE LEARNING #######################################
            # rank pool texts by the entropy of the predicted class
            # probabilities and take the STEPSIZE most uncertain ones
            clf = train_svm(annotated_active["X"].to_list(), annotated_active["y"].to_list(), proba=True)
            pooldf_active["metric"] = entropy(clf.predict_proba(pooldf_active["X"].to_list()).transpose())
            pooldf_active = pooldf_active.sort_values(by="metric", ascending=False)
            new_active = pooldf_active.iloc[0:STEPSIZE]
            pooldf_active = pooldf_active.drop(new_active.index)

            ###################################### END SAMPLING STAGE ##########################

            # add newly sampled texts to each of the training sets
            annotated_ours_original = pd.concat([annotated_ours_original, new_ours_original])
            annotated_ours_closer = pd.concat([annotated_ours_closer, new_ours_closer])
            annotated_random = pd.concat([annotated_random, new_random])
            annotated_active = pd.concat([annotated_active, new_active])

        # train models
        model_ours_original = train_svm(annotated_ours_original["X"].to_list(), annotated_ours_original["y"].to_list())
        model_ours_closer = train_svm(annotated_ours_closer["X"].to_list(), annotated_ours_closer["y"].to_list())
        model_random = train_svm(annotated_random["X"].to_list(), annotated_random["y"].to_list())
        model_active = train_svm(annotated_active["X"].to_list(), annotated_active["y"].to_list())

        # evaluate and save results
        f1s_ours_closer.append(create_classification_report(model_ours_closer, test_X, test_y))
        f1s_ours_original.append(create_classification_report(model_ours_original, test_X, test_y))
        f1s_random.append(create_classification_report(model_random, test_X, test_y))
        f1s_active.append(create_classification_report(model_active, test_X, test_y))

        # save number of samples
        samples.append(annotated_random.shape[0])

    # Built after the loop so the function also returns (empty lists) when
    # NUM_STEPS is 0 instead of failing on an unbound local.
    return [f1s_random, f1s_active, f1s_ours_original, f1s_ours_closer, samples]


# In[ ]:


### add column with integers for categorical variables:

## convert categories to integer labels
## `le` is a single module-level encoder shared by several functions:
## run_simulation re-fits it on every run (le.fit_transform), while
## create_classification_report and get_weights read the fitted state
## (le.inverse_transform / le.classes_).
from sklearn import preprocessing
le = preprocessing.LabelEncoder()


# In[ ]:


def run_simulation(pooldf, RUNS=RUNS, STEPSIZE=STEPSIZE, NUM_STEPS=NUM_STEPS):
    """Build one skewed corpus and run all annotation strategies on it.

    Parameters
    ----------
    pooldf : pandas.DataFrame
        Full corpus with 'Category' and 'X' columns.
    RUNS : int
        Not used inside this function (one call performs a single run); kept
        so existing calls that pass it keep working.
    STEPSIZE, NUM_STEPS : int
        Forwarded to run_annotations. They were previously accepted but
        silently ignored, so run_annotations always used its own defaults.

    Returns
    -------
    list
        The output of run_annotations for this run.
    """
    ## First, create the df
    ## Choose random categories from each list
    rare_categories = choose_random_categories(potential_rare, 1)
    freq_categories = choose_random_categories(potential_freq, 3)

    ## Convert the df, including ratios
    converted_ = filter_categories_for_pool(rare_categories, freq_categories, pooldf)

    ## Assigning numerical values and storing in another column
    converted_['y'] = le.fit_transform(converted_['Category'])

    ### Train-Test split:
    X_pool, X_test, y_pool, y_test = train_test_split(converted_['X'], converted_['y'], test_size=0.20, random_state=42)
    # labels are kept as strings of the integer codes
    pool = pd.DataFrame({"X": X_pool.tolist(), "y": [str(x) for x in y_pool]})
    test = pd.DataFrame({"X": X_test.tolist(), "y": [str(x) for x in y_test]})

    # sorted(): the iteration order of a set of strings can differ between
    # interpreter sessions, which would change which category a given
    # random.choice draw refers to.
    new_categories = sorted(set(pool['y']))

    ## Run the simulation:
    return run_annotations(pool, new_categories, test=test,
                           STEPSIZE=STEPSIZE, NUM_STEPS=NUM_STEPS)


# In[ ]:


### One dictionary per annotation strategy (plus one for the sample counts).
## Keys are the run number as a string; values are that run's list of
## per-step results returned by run_simulation.

random_dict = {}
active_dict = {}
ours_dict_closer = {}
ours_original_dict = {}
samples_dict = {}

## Same order as the list returned by run_simulation
result_stores = (random_dict, active_dict, ours_original_dict,
                 ours_dict_closer, samples_dict)

for run in tqdm_notebook(range(RUNS)):
    outputs = run_simulation(df) # Return the outputs from the simulation's run
    for store, result in zip(result_stores, outputs):
        store[str(run)] = result


# # Visualizing the Results

# In[ ]:


## Average the macro-F1 curve over all runs of one annotation strategy
def extract_macros_mean(results_dict):
    """Return the element-wise mean macro-F1 across runs.

    `results_dict` maps a run key to that run's list of classification
    reports; from each report the ['macro avg']['f1-score'] value is taken.
    The result has one entry per step (mean over runs, axis 0).
    """
    macro_curves = [
        [report['macro avg']['f1-score'] for report in results_dict[run_key]]
        for run_key in results_dict
    ]
    return np.mean(macro_curves, axis=0)


# In[ ]:


## Plot the mean macro-F1 curve of every annotation strategy

import matplotlib.pyplot as plt

plt.figure(figsize=(12,8))

## Note, sample size remains consistent throughout the multiple runs,
## so the x-axis is taken from the first run
x_values = samples_dict['0']

for results, label in (
    (random_dict, 'Random Sampling'),
    (active_dict, 'Active Learning'),
    (ours_original_dict, 'Our Method'),
    (ours_dict_closer, 'Our Method - modified'),
):
    plt.plot(x_values, extract_macros_mean(results), label=label)

plt.xlabel('# of training samples', fontsize=14, fontweight='bold')
plt.ylabel('f1 (macro)', fontsize=14, fontweight='bold')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
legend_texts = plt.gca().get_legend().get_texts()
plt.setp(legend_texts, fontsize='14')
plt.tight_layout()

### To save the graph:
# plt.savefig('Comparing_Annotation_Approaches.png')


# In[ ]:




